from datasets import load_dataset ,Dataset
import pandas as pd  
from transformers import AutoTokenizer  
from transformers import AutoModelForSequenceClassification   #分类模型
from transformers import TrainingArguments 
from transformers import Trainer
from transformers import pipeline
import torch

def train():
    """Fine-tune a pretrained sequence-classification model on the Toutiao
    news-headline dataset and save the resulting model + tokenizer locally.

    Side effects: reads the dataset from a fixed local path, writes
    checkpoints to ./results and the final model to ./saved_model2.
    Raises Exception if CUDA is not available.
    """
    # Load the news dataset: a '_!_'-separated text file with no header row.
    # engine='python' is required for a multi-character separator; the C
    # engine cannot handle it and pandas would otherwise fall back with a
    # ParserWarning.
    df = pd.read_csv(
        r'E:\python-knowledge\6项目实战\大模型微调\toutiao_cat_data\toutiao_cat_data.txt',
        sep='_!_',          # field separator used by the Toutiao dump
        header=None,        # file has no header row
        encoding='utf-8',
        engine='python',
    )

    # Keep column 3 (headline text) and column 1 (numeric category code).
    # First 150000 rows for training, last 100 rows as a small eval set.
    # .copy() avoids pandas SettingWithCopyWarning on the assignments below.
    train_subset = df.iloc[:150000, [3, 1]].copy()
    test_subset = df.iloc[-100:, [3, 1]].copy()

    train_subset.columns = ['text', 'label']
    test_subset.columns = ['text', 'label']

    # Category codes start at 100; shift so labels start at 0 as the model
    # expects. Vectorized conversion replaces the original row-by-row
    # iterrows() loop (orders of magnitude faster on 150k rows) and also
    # guarantees an integer dtype for the label column.
    train_subset['label'] = train_subset['label'].astype(int) - 100
    test_subset['label'] = test_subset['label'].astype(int) - 100

    # Convert the DataFrames into HuggingFace Datasets.
    train_dataset = Dataset.from_pandas(train_subset)
    test_dataset = Dataset.from_pandas(test_subset)

    print(f"训练集: {len(train_subset)}条, 测试集: {len(test_subset)}条")
    print("标签分布:\n", train_subset["label"].value_counts())

    # Tokenizer must match the pretrained model loaded below.
    tokenizer = AutoTokenizer.from_pretrained(r"E:\python-knowledge\saved_model")

    def preprocess_function(examples):
        # Tokenize each example, truncating/padding to a fixed 512 tokens.
        return tokenizer(examples['text'],
                         truncation=True,        # cut off over-long inputs
                         padding='max_length',   # pad every input to max_length
                         max_length=512)

    # batched=True lets map() tokenize many rows per call, which is faster.
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_test = test_dataset.map(preprocess_function, batched=True)

    # num_labels=18 — assumes the shifted category codes fall in 0..17;
    # TODO(review): confirm against the actual label distribution printed above.
    model = AutoModelForSequenceClassification.from_pretrained(
        r"E:\python-knowledge\saved_model",
        num_labels=18,
    )
    if torch.cuda.is_available():
        model.to(torch.device('cuda'))
    else:
        raise Exception("请先安装cuda")

    training_args = TrainingArguments(
        output_dir='./results',          # checkpoint/output directory
        num_train_epochs=3,              # number of training epochs
        per_device_train_batch_size=16,  # training batch size per device
        per_device_eval_batch_size=8,    # eval batch size per device
        evaluation_strategy='epoch',     # evaluate once per epoch
        learning_rate=2e-5,              # small LR, typical for fine-tuning
        weight_decay=0.01,               # weight decay regularization
        save_strategy='epoch',           # save a checkpoint each epoch
        logging_steps=100                # log every 100 steps
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
    )
    trainer.train()

    # Persist the fine-tuned model and its tokenizer together.
    # NOTE(review): pred() loads './saved_model', not './saved_model2' —
    # confirm which checkpoint inference is meant to use.
    model.save_pretrained('./saved_model2')
    tokenizer.save_pretrained('./saved_model2')


def pred():
    """Run inference with the saved classifier on a sample news headline."""
    # NOTE(review): train() saves to './saved_model2' but this loads
    # './saved_model' — confirm which checkpoint is intended here.
    model_dir = './saved_model'
    classifier = pipeline(
        'text-classification',
        model=model_dir,
        tokenizer=model_dir,
    )

    # Print the predicted label and its confidence score.
    print(classifier("NBA大牌女明星球迷都有谁？"))



if __name__ == '__main__':
    # train()  # uncomment to fine-tune and save the model first
    pred()  # run inference with the saved model
